In [1]:
# -*-coding:utf-8-*-
%matplotlib inline
import os

import numpy as np
from matplotlib import pyplot as plt
from sklearn.cross_validation import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

from bald_latin import remove_cyrillic_and_accents as balden

Load the original and stemmed comments, along with their labels.

Remove the Cyrillic comments and strip the accents from ć, č, ž, š and đ.
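
The cleaning is done by remove_cyrillic_and_accents from the local bald_latin module, imported as balden. The snippet below is a minimal sketch of the idea, assuming a regex filter for Cyrillic script and a small accent map; the real module may differ, and as used below it can also take a parallel list of labels and filter it in step.

# Illustrative only -- the actual implementation lives in bald_latin.py.
import re

CYRILLIC = re.compile(u'[\u0400-\u04FF]')
ACCENT_MAP = {u'ć': u'c', u'č': u'c', u'ž': u'z', u'š': u's', u'đ': u'dj'}

def remove_cyrillic_and_accents_sketch(lines):
    """Drop lines written in Cyrillic and strip the accented characters from the rest."""
    kept = []
    for line in lines:
        if isinstance(line, str):
            line = line.decode('utf-8')
        if CYRILLIC.search(line):
            continue
        for accented, plain in ACCENT_MAP.items():
            line = line.replace(accented, plain)
        kept.append(line)
    return kept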


In [2]:
# comments collected by the "Lovac na sendvice" app
def load_comments_and_labels():
    lns_comments = balden(open('dataset/lns/lns_comments.txt', 'r').readlines())
    lns_stemmed = open('dataset/lns/lns_comments_stemmed.txt', 'r').readlines()
    lns_labels = open('dataset/lns/lns_labels.txt', 'r').readlines()
    
    # remove Cyrillic comments and strip accents from the stemmed comments (labels stay aligned)
    lns_stemmed, lns_labels = balden(lns_stemmed, lns_labels)
    
    # labels as a numpy array
    lns_labels = np.array([int(float(x)) for x in lns_labels])
    
    assert len(lns_comments) == len(lns_stemmed)
    return lns_comments, lns_stemmed, lns_labels
   
def load_scraped():
    # scraped comments from Blic.rs
    scraped_comments, scraped_stemmed = balden(
        open('dataset/scraped/comments.txt').readlines(),
        open('dataset/scraped/comments_stemmed.txt').readlines())
    
    assert len(scraped_comments) == len(scraped_stemmed)
    return scraped_comments, scraped_stemmed

def load_scraped_not_category():
    scraped_nots_comments, scraped_nots_stemmed = balden(
        open('dataset/scraped/slobodno_vreme.txt').readlines(),
        open('dataset/scraped/slobodno_vreme_stemmed.txt').readlines())
    
    assert len(scraped_nots_comments) == len(scraped_nots_stemmed)
    return scraped_nots_comments, scraped_nots_stemmed

lns_comments, lns_stemmed, lns_labels = load_comments_and_labels()
print "Loaded LnS comments and labels"
scraped_comments, scraped_stemmed = load_scraped()
print "Loaded scraped comments"
scraped_nots_comments, scraped_nots_stemmed = load_scraped_not_category()
print "Loaded scraped nots"


Loaded LnS comments and labels
Loaded scraped comments
Loaded scraped nots

Create a TF-IDF vectorizer and add methods for testing predictions.


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

def build_vectorizer():
    croatian_stop_words = set([u"a",u"ako",u"ali",u"bi",u"bih",u"bila",u"bili",u"bilo",u"bio",u"bismo",u"biste",u"biti",u"bumo",u"da",u"do",u"duž",u"ga",u"hoće",u"hoćemo",u"hoćete",u"hoćeš",u"hoću",u"i",u"iako",u"ih",u"ili",u"iz",u"ja",u"je",u"jedna",u"jedne",u"jedno",u"jer",u"jesam",u"jesi",u"jesmo",u"jest",u"jeste",u"jesu",u"jim",u"joj",u"još",u"ju",u"kada",u"kako",u"kao",u"koja",u"koje",u"koji",u"kojima",u"koju",u"kroz",u"li",u"me",u"mene",u"meni",u"mi",u"mimo",u"moj",u"moja",u"moje",u"mu",u"na",u"nad",u"nakon",u"nam",u"nama",u"nas",u"naš",u"naša",u"naše",u"našeg",u"ne",u"nego",u"neka",u"neki",u"nekog",u"neku",u"nema",u"netko",u"neće",u"nećemo",u"nećete",u"nećeš",u"neću",u"nešto",u"ni",u"nije",u"nikoga",u"nikoje",u"nikoju",u"nisam",u"nisi",u"nismo",u"niste",u"nisu",u"njega",u"njegov",u"njegova",u"njegovo",u"njemu",u"njezin",u"njezina",u"njezino",u"njih",u"njihov",u"njihova",u"njihovo",u"njim",u"njima",u"njoj",u"nju",u"no",u"o",u"od",u"odmah",u"on",u"ona",u"oni",u"ono",u"ova",u"pa",u"pak",u"po",u"pod",u"pored",u"prije",u"s",u"sa",u"sam",u"samo",u"se",u"sebe",u"sebi",u"si",u"smo",u"ste",u"su",u"sve",u"svi",u"svog",u"svoj",u"svoja",u"svoje",u"svom",u"ta",u"tada",u"taj",u"tako",u"te",u"tebe",u"tebi",u"ti",u"to",u"toj",u"tome",u"tu",u"tvoj",u"tvoja",u"tvoje",u"u",u"uz",u"vam",u"vama",u"vas",u"vaš",u"vaša",u"vaše",u"već",u"vi",u"vrlo",u"za",u"zar",u"će",u"ćemo",u"ćete",u"ćeš",u"ću",u"što"])

    # build a TF-IDF vectorizer over unigrams and bigrams,
    # keeping only n-grams that appear in at least 10 documents (min_df=10)
    vectorizer = TfidfVectorizer(
        strip_accents="unicode",
        lowercase=True,
        ngram_range=(1, 2),
        min_df=10,
        norm='l2',
        smooth_idf=True,
        use_idf=True,
        stop_words=croatian_stop_words)
    
    return vectorizer
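
Testing the predictions can be done with the metrics imported at the top. A minimal sketch of such a check, assuming a k-fold loop over the stemmed LnS comments; the helper name and the number of folds are illustrative.

# Minimal sketch: k-fold evaluation of the TF-IDF + MultinomialNB pipeline.
def evaluate_nb(stemmed, labels, n_folds=5):
    scores = []
    for train_idx, test_idx in KFold(len(labels), n_folds=n_folds, shuffle=True):
        vectorizer = build_vectorizer()
        X_tr = vectorizer.fit_transform([stemmed[i] for i in train_idx])
        X_te = vectorizer.transform([stemmed[i] for i in test_idx])
        clf = MultinomialNB().fit(X_tr, labels[train_idx])
        y_pred = clf.predict(X_te)
        scores.append((accuracy_score(labels[test_idx], y_pred),
                       precision_score(labels[test_idx], y_pred),
                       recall_score(labels[test_idx], y_pred)))
    return np.mean(scores, axis=0)  # mean accuracy, precision, recall over the folds

#print evaluate_nb(lns_stemmed, lns_labels)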

Once we have picked a threshold for bot classification, let's classify the scraped comments.

Combine the newly classified bot comments with the scraped non-bot comments and train an LSTM on the combined set.
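
The threshold itself can be chosen by inspecting precision and recall on a held-out split at several cut-offs. Below is a rough sketch; the candidate thresholds are illustrative, not the values used to arrive at 0.83.

# Rough sketch: inspect precision/recall of the NB probabilities at several cut-offs.
from sklearn.cross_validation import train_test_split

tr_idx, te_idx = train_test_split(range(len(lns_labels)), random_state=0)
vec = build_vectorizer()
X_tr = vec.fit_transform([lns_stemmed[i] for i in tr_idx])
X_te = vec.transform([lns_stemmed[i] for i in te_idx])
probs = MultinomialNB().fit(X_tr, lns_labels[tr_idx]).predict_proba(X_te)[:, 1]

for t in [0.5, 0.7, 0.8, 0.83, 0.9]:
    pred = (probs > t).astype(int)
    print t, precision_score(lns_labels[te_idx], pred), recall_score(lns_labels[te_idx], pred)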


In [5]:
def classify_bots(text_train, y_train, unlabeled_stemmed, threshold=0.83):
    """
    Train the classifier on text_train and y_train, then label as bots those
    unlabeled stemmed comments whose predicted probability exceeds the threshold.
    Return the indices of the bot comments.
    """
    # build the dataset, vectorize it using TF-IDF
    vectorizer = build_vectorizer()
    X_train = vectorizer.fit_transform(text_train)
    X_unlabeled = vectorizer.transform(unlabeled_stemmed)
        
    # create and fit the classifier
    clf = MultinomialNB().fit(X_train, y_train)
    
    # predict on the unlabeled set
    y_pred = clf.predict_proba(X_unlabeled)[:,1]
    # select comments whose bot probability exceeds the threshold
    bot_indices = np.argwhere(y_pred > threshold).flatten()
    
    return bot_indices

def build_large_comment_set(lns_comments, lns_labels, scraped_bots, scraped_nots, weight=0.1):
    """
    Build a dataset from the original unstemmed 'Lovac na sendvice' comments,
    the newly classified bot comments, and the manually tagged non-bot comments.
    """
    combined_comments = lns_comments + scraped_bots + scraped_nots
    combined_labels = list(lns_labels) + list(np.ones(len(scraped_bots))) + list(np.zeros(len(scraped_nots)))
    combined_labels = np.array(combined_labels)
    # the weights of the newly added samples are decreased
    combined_weights = np.ones(len(combined_labels))
    combined_weights[len(lns_comments):] = weight
    
    return combined_comments, combined_labels, combined_weights
 

def comments2matrix(comments):
    """
    Prepare the comments to be fed to the LSTM
    """
    def remove_symbols(comments):
        # replace punctuation and whitespace characters to shrink the character set
        bad_chars = ['\n', '\t', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

        def clean_comment(comment):
            comment = comment.encode('ascii', errors='ignore').lower()
            for bc in bad_chars:
                comment = comment.replace(bc, ' ')
            return comment

        return map(clean_comment, comments)

    def pad_comments(comments, size=100):
        """
        Pad each comment to *size* characters; longer comments are truncated.
        """
        def pad(comment):
            comment = comment[:size]
            comment = comment + " " * (size - len(comment)) 
            return comment

        return map(pad, comments)

    def one_hot(comments):
        #char_set = sorted(set("".join(comments)))
        # fixed, sorted alphabet so the encoding stays stable between runs
        char_set = sorted([' ', '1', '0', '3', '2', '5', '4', '7', '6', '9', '8', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z'])
        char_set_str = "".join(char_set)
        char_set_len = len(char_set)

        # TODO optimize? should use sparse matrices
        # currently X is < 1GB, which is manageable
        X = np.zeros((len(comments), len(comments[0]), char_set_len))
        for comment_ind, comment in enumerate(comments):
            for char_ind, char in enumerate(comment):
                char_pos = char_set_str.find(char)
                if char_pos >= 0:  # ignore characters outside the alphabet
                    X[comment_ind, char_ind, char_pos] = 1

        return X 
    
    # remove symbols, pad, and vectorize
    return one_hot(pad_comments(remove_symbols(comments)))


def y_one_hot(y):
    """
    Two categories require two output neurons, so y must be converted to a one-hot representation.
    """
    one_hot = np.zeros((len(y), 2))
    one_hot[np.arange(len(y)), np.round(y).astype(int)] = 1
    return one_hot
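
A quick shape check on toy inputs, just to illustrate what these helpers produce.

# Toy inputs, illustrative only.
demo_X = comments2matrix([u"Vidi sendvic!", u"Odlican tekst."])
demo_y = y_one_hot(np.array([1, 0]))
print demo_X.shape  # (2, 100, 37): 2 comments, 100 characters each, 37-character alphabet
print demo_y.shape  # (2, 2): one column per class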

Label a dataset from the scraped comments and train the network on it.

Hold out part of the LnS comments as a test set; the network is trained on the remaining LnS comments plus the newly labeled scraped ones.


In [6]:
def build_training_and_test_set(lns_comments, lns_stemmed, lns_labels, unlabeled_comments, unlabeled_stemmed, scraped_nots):
    
    # hold out a test set from the LnS comments; the rest is the training set
    ratio = 0.8
    from sklearn.cross_validation import train_test_split
    train, test = train_test_split(range(len(lns_comments)), train_size=ratio)
    
    text_train_stemmed = [lns_stemmed[x] for x in train]
    text_train_comments = [lns_comments[x] for x in train]
    text_test_stemmed  = [lns_stemmed[x] for x in test]
    text_test_comments  = [lns_comments[x] for x in test]
    y_train = [lns_labels[x] for x in train]
    y_test  = [lns_labels[x] for x in test]
        
    print("Built the training dataset")
        
    bot_indices = classify_bots(text_train_stemmed, y_train, unlabeled_stemmed)
    print("Classified bots using NB")
    
    # using the NB predictions, pull the bots from the original comments
    classified_bots = [unlabeled_comments[x] for x in bot_indices]
    # and take the same number of comments from the non-bot category
    classified_nots = scraped_nots[:len(classified_bots)]
        
    # build a large dataset from LnS comments and NB-labeled comments; NB-labeled comments get lower weights.
    comments_large, y_large, weights_large = build_large_comment_set(
        text_train_comments, 
        y_train, 
        classified_bots, 
        classified_nots) 
    
    # shuffle the combined comments and labels together
    from sklearn.utils import shuffle
    comments_large, y_large = shuffle(comments_large, y_large)
    
    y_large = y_one_hot(y_large)
    y_test = y_one_hot(y_test)
    print("Built the training set for the LSTM")

    X_large = comments2matrix(comments_large)
    X_test = comments2matrix(text_test_comments)
    print("Vectorized the training set")
    
    return X_large, y_large, X_test, y_test

    
max_scraped = 3*10**6  # cap the number of scraped comments to keep the runtime manageable
unlabeled_comments = scraped_comments[:max_scraped]
unlabeled_stemmed = scraped_stemmed[:max_scraped]

X_train, y_train, X_test, y_test = build_training_and_test_set(lns_comments, lns_stemmed, lns_labels, unlabeled_comments, unlabeled_stemmed, scraped_nots_comments)


Built the training dataset
Classified bots using NB
Built the training set for the LSTM
Vectorized the training set

In [ ]:
def build_net(shape, pool_length=2):
    from keras.models import Sequential
    from keras.layers import Dense, Activation, Convolution1D, MaxPooling1D, LSTM

    # stacked 1D convolutions over the character one-hot matrix
    model = Sequential()
    model.add(Convolution1D(input_shape=(shape[1], shape[2]),
                            nb_filter=16, filter_length=11, border_mode='valid', activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=16, filter_length=11, border_mode='valid', activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(Convolution1D(nb_filter=32, filter_length=9, border_mode='valid', activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=32, filter_length=9, border_mode='valid', activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(Convolution1D(nb_filter=64, filter_length=7, border_mode='valid', activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=64, filter_length=7, border_mode='valid', activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=pool_length))

    # two recurrent layers read the convolved sequence; the second one returns a single vector
    model.add(LSTM(64, dropout_U=0.5, dropout_W=0.3, return_sequences=True))
    model.add(LSTM(64, dropout_U=0.3, dropout_W=0.3, return_sequences=False))

    # two-way softmax output: bot vs. not-bot
    model.add(Dense(20))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    # try different optimizers and optimizer configs; adam is used here
    model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

    return model


net = build_net(X_train.shape)
print("Built the net")

epoch = 100
batch_size = 64
max_samples = 20000

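# fit one epoch at a time so the validation metrics are printed after every pass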
for ep in range(epoch):
    net.fit(X_train[:max_samples], y_train[:max_samples],
        batch_size=batch_size,
        nb_epoch=1,
        validation_data=[X_test, y_test])


Built the net
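
Once training finishes, the held-out LnS comments can be scored with the metrics imported at the top; a minimal evaluation sketch follows.

# Minimal evaluation sketch on the held-out LnS comments (class 1 = bot).
probs = net.predict(X_test, batch_size=batch_size)
pred_labels = np.argmax(probs, axis=1)
true_labels = np.argmax(y_test, axis=1)
print "accuracy :", accuracy_score(true_labels, pred_labels)
print "precision:", precision_score(true_labels, pred_labels)
print "recall   :", recall_score(true_labels, pred_labels)
print confusion_matrix(true_labels, pred_labels)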

In [ ]: